#!/usr/bin/env bash
# Launcher for SFT fine-tuning of Qwen1.5-14B-Chat with Megatron
# (Pai-Megatron-Patch). Assumes the HuggingFace checkpoint has already been
# converted to Megatron format at PRETRAIN_CHECKPOINT_PATH. All settings are
# forwarded positionally to run_finetune_megatron_qwen_withGA.sh, so the
# argument ORDER at the bottom must not change.
set -e
set -x

# --- Distributed launch environment: single node, 8 GPUs ---
export WORLD_SIZE="1"
export RANK="0"
export KUBERNETES_CONTAINER_RESOURCE_GPU="8"
export MASTER_ADDR="192.169.114.10"
export MASTER_PORT="11130"

# --- Training configuration ---
ENV="dsw"                        # dlc: multi-node / dsw: single-node
MODEL_TYPE="qwen1.5-14b"         # NOTE(review): set but never passed below — confirm whether still needed
MODEL_SIZE="14B"
BATCH_SIZE="4"                   # micro batch size per GPU
GLOBAL_BATCH_SIZE="1152"
LR="2e-5"
MIN_LR="1e-6"
SEQ_LEN="512"
PAD_LEN="512"
EXTRA_VOCAB_SIZE="421"           # vocabulary extension size (qwen1.5-14b/32b: 421)
PR="fp16"                        # training precision: fp16, bf16
TP="1"                           # tensor (model) parallelism degree
PP="8"                           # pipeline parallelism degree
AC="sel"                         # activation checkpointing mode: sel, full
DO="true"                        # use Megatron ZeRO-1 optimizer to reduce memory: true, false
FL="true"                        # use Flash Attention: true, false
SP="false"                       # use sequence parallelism: true, false
TE="false"                       # use Transformer Engine: true, false
SAVE_INTERVAL="1000"             # checkpoint save interval (steps)
DATASET_PATH="/mnt/home/opsfm-xz/data/megatron/zedxsftqatoken10xumac_1.0_3000000_train.json"              # training dataset path
VALID_DATASET_PATH="/mnt/home/opsfm-xz/data/megatron/zedxsftqatoken10xumac_1.0_3000000_valid.json"        # validation dataset path
# DATASET_PATH="/mnt/tenant-home_speed/xz/Pai-Megatron-Patch/qwen-datasets/alpaca_zh-qwen-train.json"
# VALID_DATASET_PATH="/mnt/tenant-home_speed/xz/Pai-Megatron-Patch/qwen-datasets/alpaca_zh-qwen-valid.json"
PRETRAIN_CHECKPOINT_PATH="/mnt/tenant-home_speed/xz/opsfm-xz/models/Qwen/Qwen1.5-14B-Chat-megatron-tp${TP}-pp${PP}"  # pretrained (Megatron-format) model path
TRAIN_ITERS="10000"               # number of training steps
WARMUP_ITERS="20"              # number of warmup steps
OUTPUT_BASEPATH="/mnt/tenant-home_speed/xz/opsfm-xz/sft_checkpoint/xz/Qwen1.5-14B-Chat-megatron-tp${TP}-pp${PP}-umac-token10x-sft-10000step"           # training output path

# Explicit guard (SC2164): fail loudly if the workspace path is missing,
# even if someone later removes `set -e`.
cd /mnt/tenant-home_speed/xz/Pai-Megatron-Patch/examples/qwen1_5 || { echo "cannot cd to Pai-Megatron-Patch examples dir" >&2; exit 1; }
source /root/miniconda3/etc/profile.d/conda.sh && conda activate megatron
# export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export OMP_NUM_THREADS=1
# Positional argument order is fixed by run_finetune_megatron_qwen_withGA.sh.
bash run_finetune_megatron_qwen_withGA.sh "${ENV}" ../../ "${MODEL_SIZE}" "${BATCH_SIZE}" "${GLOBAL_BATCH_SIZE}" "${LR}" "${MIN_LR}" "${SEQ_LEN}" "${PAD_LEN}" "${EXTRA_VOCAB_SIZE}" "${PR}" "${TP}" "${PP}" "${AC}" "${DO}" "${FL}" "${SP}" "${TE}" "${SAVE_INTERVAL}" "${DATASET_PATH}" "${VALID_DATASET_PATH}" "${PRETRAIN_CHECKPOINT_PATH}" "${TRAIN_ITERS}" "${WARMUP_ITERS}" "${OUTPUT_BASEPATH}"
